1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler.retriever;
28
29 import java.io.ByteArrayOutputStream;
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.net.URL;
33 import org.apache.commons.httpclient.Header;
34 import org.apache.commons.httpclient.HostConfiguration;
35 import org.apache.commons.httpclient.HttpClient;
36 import org.apache.commons.httpclient.HttpConnectionManager;
37 import org.apache.commons.httpclient.HttpMethod;
38 import org.apache.commons.httpclient.HttpStatus;
39 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
40 import org.apache.commons.httpclient.cookie.CookiePolicy;
41 import org.apache.commons.httpclient.methods.PostMethod;
42 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
43 import org.apache.log4j.Logger;
44 import org.smartcrawler.common.AbstractParametrizableComponent;
45 import org.smartcrawler.common.Context;
46 import org.smartcrawler.common.Link;
47 import org.smartcrawler.common.MalformedLinkException;
48 import org.smartcrawler.common.SCLogger;
49 import org.smartcrawler.extractor.HtmlURL;
50 import org.smartcrawler.extractor.HtmlURLImpl;
51 import org.smartcrawler.extractor.LinkBuilderImpl;
52
53 /***
54 *
55 *
56 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
57 * @version <tt>$Revision: 1.2 $</tt>
58 */
59 public class MultiThreadHttpCallRetriever extends HttpCallRetriever implements Retriever {
60
61 protected HttpClient httpClient;
62 private static Logger log = SCLogger.getLogger(MultiThreadHttpCallRetriever.class);
63
64 /*** The max number of http connections per host. */
65 protected static final int DEFAULT_MAX_CONN_PER_HOST = 30;
66
67 /*** The max number of http connections. */
68 protected static final int DEFAULT_MAX_TOTAL_CONN = 30;
69
70 /*** The connection timeout. */
71 protected static final int DEFAULT_CONN_TIMEOUT = 10000;
72
73 /*** The SO connection timeout. */
74 protected static final int DEFAULT_SO_TIMEOUT = 10000;
75
76 /***
77 * Creates a new instance of HttpRetriever
78 * @param host
79 */
80 public MultiThreadHttpCallRetriever() {
81 this.httpClient = createHttpClient();
82 log.info("Created multiThread retriever");
83 }
84
85 protected HttpClient getHttpClient() {
86 return this.httpClient;
87 }
88
89 /***
90 * Method which creates the default httpClient
91 *
92 * @param isMultiThread
93 * @return
94 */
95 protected HttpClient createHttpClient() {
96 log.debug("createHttpClient: BEGIN");
97
98 HttpConnectionManager connMan = null;
99 connMan = new MultiThreadedHttpConnectionManager();
100 HttpConnectionManagerParams par = new HttpConnectionManagerParams();
101 par.setDefaultMaxConnectionsPerHost(DEFAULT_MAX_CONN_PER_HOST);
102 par.setMaxTotalConnections(DEFAULT_MAX_TOTAL_CONN);
103 par.setConnectionTimeout(DEFAULT_CONN_TIMEOUT);
104 par.setSoTimeout(DEFAULT_SO_TIMEOUT);
105 connMan.setParams(par);
106 log.debug("createHttpClient: END");
107 HttpClient client = new HttpClient(connMan);
108 client.getState().setCookiePolicy(CookiePolicy.COMPATIBILITY);
109 return client;
110 }
111 }